import numpy as np
import pandas as pd
import os
import re
import tensorflow as tf
from threading import Thread
import time
from tqdm import tqdm
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.offline import init_notebook_mode
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import Sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, Activation, Dropout, Flatten, Dense, Input, Layer
from tensorflow.keras.applications import VGG16, ResNet50, DenseNet201, Xception
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
# Enable inline Plotly rendering in the notebook.
init_notebook_mode(connected=True)

# Load the image index and product metadata; styles.csv contains a few
# malformed rows, so skip them rather than fail the read.
images_df = pd.read_csv("../input/fashion-product-images-dataset/fashion-dataset/images.csv")
styles_df = pd.read_csv("../input/fashion-product-images-dataset/fashion-dataset/styles.csv", on_bad_lines='skip')

# Derive the integer product id from the "<id>.jpg" filename so the two
# tables can be joined on 'id'.
images_df['id'] = images_df['filename'].apply(lambda x: x.replace(".jpg", "")).astype(int)
images_df

# Attach each style row to its image file and build the full on-disk path.
data = styles_df.merge(images_df, on='id', how='left').reset_index(drop=True)
data['filename'] = data['filename'].apply(lambda x: os.path.join("../input/fashion-product-images-dataset/fashion-dataset/images/", x))

# Keep only rows whose image actually exists on disk.
# PERF FIX: use a set for O(1) membership tests — the original scanned a
# multi-thousand-entry list once per row (O(n^2) overall).
image_files = set(os.listdir("../input/fashion-product-images-dataset/fashion-dataset/images"))
data['file_found'] = data['id'].apply(lambda x: f"{x}.jpg" in image_files)
data = data[data['file_found']].reset_index(drop=True)
data.head()
data.isnull().sum()
# Quick EDA: row counts per category level, sorted by frequency.
# FIX: the original built each figure but never rendered it — in a plain
# script (unlike a notebook cell) a bare `fig` does nothing. Call
# fig.show() after each, consistent with the scatter plots later on.
fig = px.bar(data.groupby('masterCategory').count().reset_index(), x='masterCategory', y='id', title='Count per Product Category')
fig.update_layout(barmode='stack', xaxis={'categoryorder': 'total descending'})
fig.show()
fig = px.bar(data.groupby('subCategory').count().reset_index(), x='subCategory', y='id', title='Count per Product Sub-category', color='subCategory')
fig.update_layout(barmode='stack', xaxis={'categoryorder': 'total descending'})
fig.show()
fig = px.bar(data.groupby('season').count().reset_index(), x='season', y='id', title='Count per Season Category')
fig.update_layout(barmode='stack', xaxis={'categoryorder': 'total descending'})
fig.show()
fig = px.bar(data.groupby('usage').count().reset_index(), x='usage', y='id', title='Count per Usage Category')
fig.update_layout(barmode='stack', xaxis={'categoryorder': 'total descending'})
fig.show()
# Drop columns not used downstream (free-text name, URL, the filter flag).
data.drop(columns=['productDisplayName', 'link', 'file_found'], inplace=True)
data
# Shuffle the rows, then hold out the final 20% as the validation split.
# NOTE(review): sample() has no random_state, so the split differs on every
# run — consider seeding for reproducibility.
data = data.sample(frac=1).reset_index(drop=True)
n = len(data)
split_at = int(n * 0.8)
train = data.iloc[:split_at, :]
val = data.iloc[split_at:, :].reset_index(drop=True)
# Stream images from disk, rescaled to [0, 1].
datagen = ImageDataGenerator(rescale=1/255.)
# class_mode=None: the generators yield images only (no labels) — we only
# need them for feature extraction via model.predict.
# shuffle=False keeps generator order aligned with the dataframe rows so
# extracted features can be joined back by row index.
# FIX: dropped the original `classes=['images']` argument — `classes` lists
# label names and is ignored when class_mode is None; 'images' was never a
# label and the argument was only misleading.
train_generator = datagen.flow_from_dataframe(dataframe=train,
                                              target_size=(256, 256),
                                              x_col='filename',
                                              class_mode=None,
                                              batch_size=32,
                                              shuffle=False)
val_generator = datagen.flow_from_dataframe(dataframe=val,
                                            target_size=(256, 256),
                                            x_col='filename',
                                            class_mode=None,
                                            batch_size=32,
                                            shuffle=False)
# VGG16 convolutional base (ImageNet weights, no classifier head) followed
# by global average pooling: one fixed-length feature vector per image.
base_model = VGG16(include_top=False, input_shape=(256, 256, 3))
# FIX: build the extractor with the functional API instead of re-adding the
# base model's layers into a Sequential one by one — that idiom silently
# relies on the layers forming a strictly linear chain and breaks for any
# architecture with branches.
model = Model(inputs=base_model.input,
              outputs=GlobalAveragePooling2D()(base_model.output))
model.summary()
# Generators are unshuffled, so feature row i corresponds to dataframe row i.
train_features = model.predict(train_generator, verbose=1)
val_features = model.predict(val_generator, verbose=1)
from sklearn.decomposition import PCA

# Project the VGG features to 2-D for visualisation, fitting on train only.
pca = PCA(2)
pca.fit(train_features)
train_pca = pca.transform(train_features)
# BUG FIX: the original called pca.fit_transform here, which RE-FITS the PCA
# on the validation features and projects them onto a different basis than
# the training set, making the two embeddings incomparable. Reuse the
# train-fitted components via transform().
test_pca = pca.transform(val_features)
train_pca = pd.DataFrame(train_pca)
# Keep the first 10 metadata columns and append the two principal
# components (integer-named columns 0 and 1) by row index.
train = train.iloc[:, 0:10]
train = train.merge(train_pca, how='left', left_index=True, right_index=True)
# Colour the 2-D PCA embedding by each catalogue attribute in turn.
# Rows with missing season/usage are dropped for those two plots only.
pc_axis_labels = {"0": "Principal Component 1",
                  "1": "Principal Component 2"}
for colour_col, plot_title, drop_missing in [
        ("masterCategory", "Main Category", False),
        ("gender", "Gender", False),
        ("subCategory", "Sub Category", False),
        ("season", "Season", True),
        ("usage", "Usage", True)]:
    frame = train[train[colour_col].notna()] if drop_missing else train
    fig = px.scatter(frame, x=0, y=1, color=colour_col, title=plot_title,
                     height=600, labels=pc_axis_labels)
    fig.show()
# Inference:
# Fit a full PCA (all components) to see how many are needed to explain
# the variance of the VGG features.
pca = PCA()
pca.fit(train_features)
train_pca = pca.transform(train_features)
variance_explained = np.cumsum(pca.explained_variance_ratio_)
pcs = range(1, len(variance_explained) + 1)
# FIX: the original built this figure as a bare expression and never showed
# it — nothing renders outside a notebook cell. Assign and show explicitly,
# consistent with the other plot cells.
fig = px.line(x=pcs, y=variance_explained,
              title='Principal Components Cumulative Explained Variance',
              height=600,
              labels={"x": "Principal Components", "y": "Explained Variance"})
fig.show()
# Number of principal components kept for retrieval (presumably chosen from
# the cumulative-variance curve above — TODO confirm threshold).
N_COMPONENTS = 313
# BUG FIX: the original called pca.fit_transform here, re-fitting the PCA on
# the validation features alone; project them with the train-fitted PCA
# (transform) so train and val share the same basis.
val_pca = pca.transform(val_features)[:, :N_COMPONENTS]
val_pca = pd.DataFrame(val_pca)
# Keep the first 10 metadata columns and append the PCA columns by row index.
val = val.iloc[:, 0:10]
val = val.merge(val_pca, how='left', left_index=True, right_index=True)
# X = PCA feature matrix; y = product ids. KNeighborsClassifier is used
# purely as a nearest-neighbour index (via kneighbors), never to classify.
X = val.iloc[:, -N_COMPONENTS:]
y = val['id']
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=6)
neigh.fit(X, y)
def read_img(image_path):
    """Load an image from disk, resize to 256x256, and scale pixels to [0, 1]."""
    pixels = img_to_array(load_img(image_path, target_size=(256, 256, 3)))
    return pixels / 255.
import random

# Show 10 random validation images next to their 5 nearest neighbours in
# PCA feature space. Neighbour 0 is the query image itself (distance 0, the
# index was fitted on these same rows), so ranks 1-5 are displayed.
for _ in range(10):
    # BUG FIX: the original used random.randint(1, len(val)), whose upper
    # bound is INCLUSIVE — i == len(val) raised a KeyError on val.loc, and
    # row 0 could never be sampled. randrange covers 0..len(val)-1.
    query_idx = random.randrange(len(val))
    img1 = read_img(val.loc[query_idx, 'filename'])
    dist, index = neigh.kneighbors(X=X.iloc[query_idx, :].values.reshape(1, -1))
    plt.figure(figsize=(4, 4))
    plt.imshow(img1)
    plt.title("Input Image")
    plt.figure(figsize=(20, 20))
    # FIX: the inner loop reused `i`, shadowing the query index; use a
    # distinct name for the neighbour rank.
    for rank in range(1, 6):
        plt.subplot(1, 5, rank)
        plt.subplots_adjust(hspace=0.5, wspace=0.3)
        image = read_img(val.loc[index[0][rank], 'filename'])
        plt.imshow(image)
        plt.title(f'Similar Product #{rank}')